#!/usr/bin/python27
#coding=utf8
import urllib2,sys
from bs4 import BeautifulSoup
import re

"""
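Scrape car brand data from http://newcar.xcar.com.cn/price/.
The data forms a three-level hierarchy: brand, manufacturer, model; for now it is only written to a file.
The page distinguishes the levels by link prefix: <a href="/price/pb (brand), <a href="/price/b (manufacturer), <a href="/ (model).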
Author:Yyb
Date:20170228
Email:yangyingbo@unimlink
"""

def main():
    """Scrape the xcar price page and hand the brand tree over as SQL inserts.

    Downloads ``http://newcar.xcar.com.cn/price/``, decodes it as GBK and walks
    every ``<a>`` inside each ``<tbody>``.  Links are classified by the start
    of their serialized markup:

    * ``<a href="/price/pb<digits>/"`` -- first level (brand); the name is the
      content of the link's ``<span>``.
    * ``<a href="/price/b<digits>``    -- second level (manufacturer).
    * ``<a href="/<digits>/``          -- third level (model).  When the link's
      ``<span>`` mentions one of the sale-status markers, the attribute value
      left over after stripping the span markup is prefixed to the model name.

    The resulting ``{brand: {manufacturer: [model, ...]}}`` mapping is passed
    to ``print_car_map`` and the returned statements to ``run_Sql``.

    Second/third-level links that appear before any parent of the level above
    has been seen are skipped, because there is nothing to attach them to.
    """
    html_url = "http://newcar.xcar.com.cn/price/"

    # Compiled once; the classification depends on ``href`` being the first
    # attribute in the serialized tag, exactly as in the page markup.
    brand_link = re.compile(r'^<a href="/price/pb\d*/".*>')
    maker_link = re.compile(r'^<a href="/price/b\d*')
    model_link = re.compile(r'^<a href="/\d*/')
    status_flags = ('即将上市', '未上市', '停售')

    res = urllib2.urlopen(html_url)
    try:
        data = res.read().decode('gbk')
    finally:
        # Release the connection even when reading or decoding fails.
        res.close()
    soup = BeautifulSoup(data, 'html.parser')

    car_map = dict()
    map_erlei = None    # manufacturers of the brand currently being filled
    list_sanlei = None  # models of the manufacturer currently being filled
    for tbody_i in soup.find_all("tbody"):
        # Work on the parsed tags directly instead of re-parsing their
        # string form: same tags, no second round of encoding detection.
        for dalei_i in tbody_i.find_all("a"):
            markup = str(dalei_i)
            if brand_link.match(markup):
                key_dalei = str(dalei_i.span).replace('<span>', '').replace('</span>', '')
                # Start with an empty dict (not None) so a brand without
                # manufacturers is still safe to iterate downstream.
                map_erlei = {}
                car_map[key_dalei] = map_erlei
                # Models must not leak into the previous brand's last list.
                list_sanlei = None
            elif maker_link.match(markup):
                if map_erlei is None:
                    continue  # manufacturer link before any brand link
                relei = re.sub('<.*>', '', re.sub('<.*">', '', markup))
                list_sanlei = []
                map_erlei[relei] = list_sanlei
            elif model_link.match(markup):
                if list_sanlei is None:
                    continue  # model link before any manufacturer link
                span_markup = str(dalei_i.span)
                jjss = ''
                if any(flag in span_markup for flag in status_flags):
                    # Drop everything up to the last '=' and from '>' on,
                    # leaving the quoted attribute value of the span.
                    jjss = re.sub('>.*/span>', '', re.sub('<span.*=', '', span_markup))
                    jjss = jjss.replace('"', '').replace('\n', '')
                sanlei = re.sub('<.*>', '', re.sub('<.*">', '', markup))
                list_sanlei.append(jjss + sanlei.replace('\n', ''))

    sql_list = print_car_map(car_map)
    run_Sql(sql_list)

def print_car_map(car_map):
    """Print the brand tree and build one INSERT statement per model.

    Args:
        car_map: mapping ``{brand: {manufacturer: [model, ...]}}``.  A brand
            whose value is ``None`` (no manufacturers collected) is printed
            and otherwise skipped.

    Returns:
        A list of ``insert into cvm_cx (pp,cs,cx) ...`` statements, one per
        (brand, manufacturer, model) triple, in iteration order of the maps.
    """
    # Same text the statements always had, including the trailing newline and
    # indentation after the closing parenthesis.
    sql_template = "insert into cvm_cx (pp,cs,cx) values('%s','%s','%s')\n                "

    def quote(value):
        # The values come from scraped HTML; double embedded single quotes so
        # a name containing ' cannot terminate the SQL string literal.
        return str(value).replace("'", "''")

    list_sql = []
    for brand, makers in car_map.items():
        # Single-argument print(...) behaves the same as the print statement.
        print(brand)
        for maker, models in (makers or {}).items():
            print('    ' + str(maker))
            for model in (models or []):
                print('        ' + model)
                list_sql.append(
                    sql_template % (quote(brand), quote(maker), quote(model))
                )
    return list_sql

def run_Sql(list_sql):
    """Pass the generated SQL statements to the project's ``RunSQl`` helper.

    Args:
        list_sql: list of SQL statement strings.

    Only ``create_sqllist_file`` is invoked; direct execution through
    ``run_SqlList`` is intentionally not called here.
    NOTE(review): judging by its name, ``create_sqllist_file`` writes the
    statements to a file -- confirm against ``cvm.common.RunSql``.
    """
    # Imported lazily so the project module is only needed when this runs.
    from cvm.common.RunSql import RunSQl

    RunSQl().create_sqllist_file(list_sql)


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()